library(readxl)
library(DataExplorer)
library(reshape2)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(EnvStats)
##
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
##
## predict, predict.lm
## The following object is masked from 'package:base':
##
## print.default
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 0.3.4
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ lubridate::union() masks base::union()
library(rospca)
library(knitr)
#Data Cleaning \(Note:\) This is the first code to be run for this analysis.
#Using 'Perchlorate codebook.pdf' as data dictionary
### Additional Notes from Mark Remiker, August 2022 ###
#Num_in_household
#- No data entry error here - that’s the response!
#Num_children
#- Same! That’s the actual response.
#Armed_forces
#- 1 = Yes, 2 = No, 3 = Don’t know, 4 = Refuse
#- 61 = Yes, 62 = No, 63 = Don’t know, 64 = Refuse
unique(Metals$Armed_forces)
## [1] 1 62 2 NA
Metals$Armed_forces[Metals$Armed_forces==62] <- 2
unique(Metals$Armed_forces)
## [1] 1 2 NA
#Employment
#- 1 = Working for pay, 2= Self-employed, 3= Looking for work, 4= Temporarily laid off, 5 = Retired, 6= A homemaker, 7= Student, 8= Permanently disabled, 9= Maternity/sick leave, 10= Other, 11= Don’t know, 12 = Refused
#Health_care
#- 21= Yes, 22= No, 24= I don’t know, 23= Refuse
unique(Metals$Health_care)
## [1] 1 21 2 22 3 4 NA
Metals$Health_care[Metals$Health_care==21] <- 1
Metals$Health_care[Metals$Health_care==22] <- 2
unique(Metals$Health_care)
## [1] 1 2 3 4 NA
#Health_consult
#- 21= Yes, 22= No, 24= I don’t know, 23= Refuse
unique(Metals$Health_consult)
## [1] 1 31 3 2 4 33 32 34 35 NA
Metals$Health_consult[Metals$Health_consult==31] <- 1
Metals$Health_consult[Metals$Health_consult==32] <- 2
Metals$Health_consult[Metals$Health_consult==33] <- 3
Metals$Health_consult[Metals$Health_consult==34] <- 4
Metals$Health_consult[Metals$Health_consult==35] <- 5
unique(Metals$Health_consult)
## [1] 1 3 2 4 5 NA
#Health_consult_place
#- Blank and “.” - Missing data
#Health options/Medical conditions
#- 1= Yes, 2= No, 3= Don’t know, 4= Refused
#HTN, HTN_hx_meds, Cholesterol, Cholesterol_meds
#- 5= Yes, 6= No, 7= Don’t know
unique(Metals$HTN)
## [1] 1 6 5 2 3 7 NA
Metals$HTN[Metals$HTN==5] <- 1
Metals$HTN[Metals$HTN==6] <- 2
Metals$HTN[Metals$HTN==7] <- 3
unique(Metals$HTN)
## [1] 1 2 3 NA
unique(Metals$HTN_hx_meds)
## [1] "1" "." "5" "6" "2" NA
Metals$HTN_hx_meds[Metals$HTN_hx_meds==5] <- 1
Metals$HTN_hx_meds[Metals$HTN_hx_meds==6] <- 2
unique(Metals$HTN_hx_meds)
## [1] "1" "." "2" NA
unique(Metals$Cholesterol)
## [1] 1 6 5 2 7 3 NA
Metals$Cholesterol[Metals$Cholesterol==5] <- 1
Metals$Cholesterol[Metals$Cholesterol==6] <- 2
Metals$Cholesterol[Metals$Cholesterol==7] <- 3
unique(Metals$Cholesterol)
## [1] 1 2 3 NA
#Diabetes
#- 31= Yes, 32= No, 33= Prediabetes 35= Don't know
unique(Metals$Diabetes)
## [1] 2 32 31 1 3 33 35 NA
Metals$Diabetes[Metals$Diabetes==31] <- 1
Metals$Diabetes[Metals$Diabetes==32] <- 3
Metals$Diabetes[Metals$Diabetes==33] <- 2
Metals$Diabetes[Metals$Diabetes==35] <- 4
unique(Metals$Diabetes)
## [1] 2 3 1 4 NA
#Dm_age
#- “.” = missing data, no response.
#Endo: does the participant have endocrinology data
#- .00 = No, 1.00 = yes
unique(Metals$endo)
## [1] 1 0 NA
#These are correct here.
#EMR
#- 0= No, 1= Yes
#Generally "." is missing data
Metals2 <- Metals %>%
select(-c(Date, DOB))
Metals2[Metals2=="."] <- NA
Metals3 <- Metals %>%
select(c(Date, DOB))
Metals <- cbind(Metals2, Metals3)
remove(Metals2)
remove(Metals3)
#Participant C1045 was 9/4/2018
Metals$Date[Metals$Date=="2013-01-04"] <- "2018-09-04"
Looks like this was converted to just 1=yes, 0=no.
This project will consider the associations between concentration levels of metals and metalloids measured in hair samples of 330 Yuma County residents in 2018 and the prevalence of negative health outcomes, including endocrine disruption, reproductive disorders, obesity, and neurological disorders.
Note: Before running this code, Data_Clean.Rmd should be run to clean the dataset used in this analysis.
#create_report(Metals) (Remove the # when you actually want to run - takes too much processing power otherwise.)
#The initial columns are for Mn55, Cu65, Cd111, Hg202, Pb208, and U238.
#There are additional columns with metal concentrations for arsenic and iron (As_75 and Fe_57) that come later after the hormone level columns.
#The differences in naming format make me think these were analyzed either at different times or on different machines or with different techniques, perhaps?
#Going to make things uniform for posterity.
colnames(Metals)[colnames(Metals) == "As_75"] <- "As75"
colnames(Metals)[colnames(Metals) == "Fe_57"] <- "Fe57"
data.melt <- melt(Metals,id.vars=c('ID','Date'), measure.vars=c('Mn55','Cu65','Cd111', 'Hg202', 'Pb208', 'U238', 'As75', 'Fe57'))
ggplot(data.melt) +
geom_boxplot(aes(y=log(value), fill=variable))+
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
panel.background = element_rect(fill="#ffffff"),
axis.line = element_line(color="#000000"))+
ylab("Ln Concentration (ug/g)")+
labs(fill='Metals')
## Warning in log(value): NaNs produced
## Warning in log(value): NaNs produced
## Warning: Removed 1432 rows containing non-finite values (stat_boxplot).
#There is a single data point for the year 2013, with the rest being April-November 2018.
#This is likely either a mistake in input or some other error, but I will not include this point in subsequent analysis.
#The concentration values for this single row are blank anyway.
ggplot(data.melt, aes(y=log(value), x=Date, color=variable))+
geom_point()+
geom_smooth(se=F)+
theme(panel.background = element_rect(fill="#ffffff"),
axis.text.x=element_text(angle=60, hjust=1),
axis.line = element_line(color="#000000"))+
scale_x_date(date_breaks = "1 month", date_labels = "%b")+
ylab("Ln Concentration (Units)")+
xlab("Date of Sample Collection, 2018")+
labs(color='Metals')
## Warning in log(value): NaNs produced
## Warning in log(value): NaNs produced
## Warning in log(value): NaNs produced
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1455 rows containing non-finite values (stat_smooth).
## Warning: Removed 1455 rows containing missing values (geom_point).
#Relevant paper: https://www.sciencedirect.com/science/article/pii/S1878535212000263
data.melt %>%
group_by(variable) %>%
summarize(number_NAs=sum(is.na(value)))
## # A tibble: 8 × 2
## variable number_NAs
## <fct> <int>
## 1 Mn55 123
## 2 Cu65 123
## 3 Cd111 123
## 4 Hg202 212
## 5 Pb208 123
## 6 U238 123
## 7 As75 283
## 8 Fe57 310
#Seeing if there are negative concentrations
data.melt %>%
group_by(variable) %>%
summarize(n_negative=sum(value<0, na.rm = T))
## # A tibble: 8 × 2
## variable n_negative
## <fct> <int>
## 1 Mn55 0
## 2 Cu65 0
## 3 Cd111 0
## 4 Hg202 0
## 5 Pb208 0
## 6 U238 0
## 7 As75 12
## 8 Fe57 0
#There are twelve negative concentration levels of arsenic. This will mess with taking the lognormal to transform.
#Need to look up what to do when concentrations in a dataset are negative. Do you ignore? Do you replace with DL/sqrt(2)? Do you leave them as is and it's just tough luck?
#For now I will simply ignore!
#Making a table re: metal concentration values
data.melt$value <- as.numeric(data.melt$value)
knitr::kable(data.melt %>%
filter(value>=0) %>% #This is ignoring negative concentrations in dataset, of which As75 has twelve.
group_by(variable) %>%
summarize(n=sum(!is.na(value)),
geomean=round(geoMean(value, na.rm = T), digits = 2),
geosd=round(geoSD(value, na.rm = T), digits = 2)))
| variable | n | geomean | geosd |
|---|---|---|---|
| Mn55 | 207 | 0.97 | 4.10 |
| Cu65 | 207 | 22.90 | 4.87 |
| Cd111 | 207 | 0.07 | 2.18 |
| Hg202 | 118 | 0.28 | 4.07 |
| Pb208 | 207 | 0.48 | 3.95 |
| U238 | 207 | 0.11 | 3.33 |
| As75 | 35 | 0.02 | 3.23 |
| Fe57 | 20 | 178.99 | 3.73 |
#create_report(data.melt) (Only run if you need to.)
hist(Metals$Mn55)
hist(Metals$Cu65)
hist(Metals$Cd111)
hist(Metals$Hg202)
hist(Metals$Pb208)
hist(Metals$U238)
hist(Metals$As75)
#If negative values are excluded, lognormal is appropriate.
hist(Metals$Fe57)
#Looking at total levels first before free
TSH <- ggplot(Metals) +geom_boxplot(aes(y=TSH)) +theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +theme_classic() +ylab("TSH (units)")
T3 <- ggplot(Metals) +geom_boxplot(aes(y=TotalT3)) +theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +theme_classic() +ylab("Total T3 (units)")
T4 <- ggplot(Metals) +geom_boxplot(aes(y=TotalT4)) +theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +theme_classic() +ylab("Total T4 (units)")
Cortisol <- ggplot(Metals) +geom_boxplot(aes(y=Cortisol)) +theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +theme_classic() +ylab("Cortisol (units)")
grid.arrange(TSH, Cortisol, T3, T4)
#Looking at distributions
hist(Metals$TSH)
hist(Metals$TotalT3)
hist(Metals$TotalT4)
hist(Metals$Cortisol)
#Levels by date
data.melt2 <- melt(Metals,id.vars=c('ID','Date'), measure.vars=c('TSH','TotalT3','TotalT4', 'Cortisol'))
ggplot(data.melt2, aes(y=value, x=Date, color=variable))+
geom_point()+
geom_smooth(se=F)+
theme(panel.background = element_rect(fill="#ffffff"),
axis.text.x=element_text(angle=60, hjust=1),
axis.line = element_line(color="#000000"))+
scale_x_date(date_breaks = "1 month", date_labels = "%b")+
ylab("Hormone Levels (Units)")+
xlab("Date of Sample Collection")+
labs(color='Hormones')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#There are some dates here from 2013, but the concentration values are blank, so I removed them from the graphical display.
#create_report(data.melt2) - Use when needed.
#This may be irrelevant if we use a PCA, but doing it anyway just to see...
#I can always do this again but with the simplified results of the PCA
#Cortisol
Mn_Cortisol <- ggplot(Metals, aes(x=log(Mn55), y=Cortisol))+geom_point()
Cu_Cortisol <- ggplot(Metals, aes(x=log(Cu65), y=Cortisol))+geom_point()
Cd_Cortisol <- ggplot(Metals, aes(x=log(Cd111), y=Cortisol))+geom_point()
Hg_Cortisol <- ggplot(Metals, aes(x=log(Hg202), y=Cortisol))+geom_point()
Pb_Cortisol <- ggplot(Metals, aes(x=log(Pb208), y=Cortisol))+geom_point()
U_Cortisol <- ggplot(Metals, aes(x=log(U238), y=Cortisol))+geom_point()
As_Cortisol <- ggplot(Metals, aes(x=log(As75), y=Cortisol))+geom_point()
Fe_Cortisol <- ggplot(Metals, aes(x=log(Fe57), y=Cortisol))+geom_point()
grid.arrange(Mn_Cortisol, Cu_Cortisol, Cd_Cortisol, Hg_Cortisol, Pb_Cortisol, U_Cortisol, As_Cortisol, Fe_Cortisol)
## Warning: Removed 129 rows containing missing values (geom_point).
## Removed 129 rows containing missing values (geom_point).
## Removed 129 rows containing missing values (geom_point).
## Warning: Removed 215 rows containing missing values (geom_point).
## Warning: Removed 129 rows containing missing values (geom_point).
## Removed 129 rows containing missing values (geom_point).
## Warning in log(As75): NaNs produced
## Warning in log(As75): NaNs produced
## Warning: Removed 297 rows containing missing values (geom_point).
## Warning: Removed 310 rows containing missing values (geom_point).
This project will consider the associations between concentration levels of metals and metalloids measured in hair samples of 330 Yuma County residents in 2018 and the prevalence of negative health outcomes, including endocrine disruption, reproductive disorders, obesity, and neurological disorders.
\(Note:\) Before running this code, Data_Clean.Rmd and Yuma_Metals_Analysis.Rmb should be run to clean the dataset used in this analysis.
colnames(Metals)
## [1] "ID" "Mn55"
## [3] "Cu65" "Cd111"
## [5] "Hg202" "Pb208"
## [7] "U238" "Site"
## [9] "Gender" "DOB_month"
## [11] "DOB_day" "DOB_year"
## [13] "Ethnicity" "Race_ai_an"
## [15] "Race_asian" "Race_black"
## [17] "Race_pac_isl" "Race_white"
## [19] "Race_other" "Race_dk"
## [21] "Race_refuse" "Race_other_TEXT"
## [23] "Marital_status" "Num_in_household"
## [25] "Num_children" "Child_under_18"
## [27] "Education" "Education_TEXT"
## [29] "Birth_country" "Birth_country_TEXT"
## [31] "Armed_forces" "Army"
## [33] "Navy" "Marine_corp"
## [35] "Air_force" "Coast_guard"
## [37] "Other" "Armed_forces_dk"
## [39] "Armed_forces_refuse" "Armed_forces_TEXT"
## [41] "Work_for_pay" "Self_employed"
## [43] "Looking" "Laid_off"
## [45] "Retired" "Homemaker"
## [47] "Student" "Disabled"
## [49] "Maternity_sick_leave" "employ_other"
## [51] "employ_other_A" "employ_dk"
## [53] "employ_TEXT" "Occupation"
## [55] "Farmer" "Pesticides"
## [57] "Lived_in_Yuma" "Born_in_Yuma"
## [59] "Yuma_pregnant_mom" "Residence_type"
## [61] "Residence_TEXT" "Own_rent"
## [63] "Own_rent_TEXT" "Homeless"
## [65] "Water_home" "Water_home_TEXT"
## [67] "Water_cooking" "Water_cooking_TEXT"
## [69] "Water_coffee" "Water_coffee_TEXT"
## [71] "Water_work" "Water_work_TEXT"
## [73] "Health_care" "Health_care_source"
## [75] "Health_care_source_TEXT" "Healthcare_lastyear"
## [77] "Health_consult" "Health_consult_place"
## [79] "Health_consult_place_TEXT" "Routine_care"
## [81] "Routine_care_TEXT" "Doctor_time"
## [83] "ER_visits" "afford_rx_meds"
## [85] "afford_mh_care" "afford_dental_care"
## [87] "afford_eyeglasses" "afford_specialist"
## [89] "afford_follow_up" "Delay_telephone"
## [91] "Delay_appt" "Delay_wait"
## [93] "Delay_open" "Delay_transportation"
## [95] "Mental_health" "General_health"
## [97] "Health_compare" "Height_ft"
## [99] "Height_inch" "Weight"
## [101] "Overweight" "Dental_visit"
## [103] "Dental_health" "arthritis"
## [105] "gout" "chf"
## [107] "chd" "angnia"
## [109] "heart_attack" "stroke"
## [111] "emphysema" "thyroid_prob"
## [113] "hyperthyroid" "hypothyroid"
## [115] "thyroid_cancer" "goiter"
## [117] "weak_kidney" "cancer"
## [119] "pos" "undescended"
## [121] "ambiguous" "HTN"
## [123] "HTN_hx_meds" "HTN_current_med"
## [125] "Cholesterol" "Chol_med"
## [127] "Diabetes" "DM_age"
## [129] "DM_age_TEXT" "insulin"
## [131] "diabetic_pills" "Income"
## [133] "age" "thyroid_prob_D"
## [135] "Hyper_D" "Hypo_D"
## [137] "Thyroid_cancer_D" "Goiter_D"
## [139] "thyroid_conditions" "clinical"
## [141] "endo" "fT3"
## [143] "TSH" "fT4"
## [145] "TotalT4" "TotalT3"
## [147] "Cortisol" "latitude"
## [149] "longitude" "emr_height_inches"
## [151] "emr_weight" "emr_bmi"
## [153] "emr_thyroid_problem" "emr_hyperthyroidism"
## [155] "emr_hypothyroidism" "emr_thyroid_cancer"
## [157] "emr_goiter" "emr_cancer"
## [159] "emr_pos" "emr_diabetes"
## [161] "emr_obesity" "sc"
## [163] "emr_sud" "emr_asd"
## [165] "meds" "meds_name"
## [167] "thyroid_med" "free_t3_med"
## [169] "tsh_med" "total_t4_med"
## [171] "free_t4_med" "total_t3_med"
## [173] "cortisol_med" "Fe57"
## [175] "As75" "height_total"
## [177] "bmi" "age_group"
## [179] "Site_Collapsed" "hyper_recode"
## [181] "hypo_recode" "thy_cancer_recode"
## [183] "goiter_recode" "pos_recode"
## [185] "dm_recode" "emr_dm_recode"
## [187] "sud" "tobacco_history"
## [189] "number_years" "pack_per_day"
## [191] "quit_date" "tremor"
## [193] "FAC1_1" "FAC2_1"
## [195] "FAC3_1" "FAC4_1"
## [197] "FAC5_1" "Date"
## [199] "DOB"
#Date range of responses
kable(Metals %>%
# transform to date format with lubridate
mutate(Date = ymd(Date)) %>%
# find min and max
summarise(min = min(Date, na.rm = T),
max = max(Date, na.rm = T)))
| min | max |
|---|---|
| 2018-04-20 | 2018-11-26 |
kable(Metals %>%
filter(!is.na(ID)) %>%
summarize("n"=length(ID)))
| n |
|---|
| 330 |
kable(t(Metals %>%
summarize("Male"=sum(Gender==1, na.rm = T),
"Female"=sum(Gender==2, na.rm = T),
"Refused Gender"=sum(Gender==3, na.rm = T),
"18-39"=sum(age>=18 & age<40, na.rm = T),
"40-59"=sum(age>=40 & age<60, na.rm = T),
"60+"=sum(age>=60, na.rm = T),
"Hispanic"=sum(Ethnicity==1, na.rm = T),
"Not Hispanic"=sum(Ethnicity==2, na.rm = T),
"Married"=sum(Marital_status==1, na.rm=T),
"Widowed"=sum(Marital_status==2, na.rm=T),
"Divorced"=sum(Marital_status==3, na.rm=T),
"Separated"=sum(Marital_status==4, na.rm=T),
"Never married"=sum(Marital_status==5, na.rm=T),
"No school"=sum(Education==1, na.rm = T),
"Elementary school"=sum(Education==2, na.rm = T),
"Middle school"=sum(Education==3, na.rm = T),
"High school"=sum(Education==4, na.rm = T),
"College"=sum(Education==5, na.rm = T),
"Advanced degree"=sum(Education==6, na.rm = T),
"Other"=sum(Education==7, na.rm = T))))
| Male | 68 |
| Female | 255 |
| Refused Gender | 0 |
| 18-39 | 93 |
| 40-59 | 153 |
| 60+ | 72 |
| Hispanic | 286 |
| Not Hispanic | 37 |
| Married | 197 |
| Widowed | 20 |
| Divorced | 44 |
| Separated | 10 |
| Never married | 48 |
| No school | 20 |
| Elementary school | 41 |
| Middle school | 53 |
| High school | 93 |
| College | 80 |
| Advanced degree | 18 |
| Other | 18 |
kable(t(Metals %>%
group_by(Site) %>%
summarize("n"=length(Site),
"Male"=sum(Gender==1, na.rm = T),
"Female"=sum(Gender==2, na.rm = T),
"Refused Gender"=sum(Gender==3, na.rm = T),
"18-39"=sum(age>=18 & age<40, na.rm = T),
"40-59"=sum(age>=40 & age<60, na.rm = T),
"60+"=sum(age>=60, na.rm = T),
"Hispanic"=sum(Ethnicity==1, na.rm = T),
"Not Hispanic"=sum(Ethnicity==2, na.rm = T),
"Married"=sum(Marital_status==1, na.rm=T),
"Widowed"=sum(Marital_status==2, na.rm=T),
"Divorced"=sum(Marital_status==3, na.rm=T),
"Separated"=sum(Marital_status==4, na.rm=T),
"Never married"=sum(Marital_status==5, na.rm=T),
"No school"=sum(Education==1, na.rm = T),
"Elementary school"=sum(Education==2, na.rm = T),
"Middle school"=sum(Education==3, na.rm = T),
"High school"=sum(Education==4, na.rm = T),
"College"=sum(Education==5, na.rm = T),
"Advanced degree"=sum(Education==6, na.rm = T),
"Other"=sum(Education==7, na.rm = T))))
| Site | 1 | 2 | 3 |
| n | 78 | 126 | 126 |
| Male | 28 | 17 | 23 |
| Female | 47 | 106 | 102 |
| Refused Gender | 0 | 0 | 0 |
| 18-39 | 22 | 19 | 52 |
| 40-59 | 35 | 69 | 49 |
| 60+ | 15 | 35 | 22 |
| Hispanic | 75 | 123 | 88 |
| Not Hispanic | 0 | 0 | 37 |
| Married | 35 | 86 | 76 |
| Widowed | 8 | 5 | 7 |
| Divorced | 9 | 16 | 19 |
| Separated | 4 | 6 | 0 |
| Never married | 16 | 10 | 22 |
| No school | 6 | 13 | 1 |
| Elementary school | 6 | 33 | 2 |
| Middle school | 13 | 28 | 12 |
| High school | 20 | 32 | 41 |
| College | 20 | 11 | 49 |
| Advanced degree | 4 | 2 | 12 |
| Other | 6 | 4 | 8 |
#Below will give a % of folks who rated their general health as fair (4) or poor (5)
(Metals %>%
filter(General_health==4|General_health==5) %>%
summarise(Count = n()))/sum(!is.na(Metals$General_health))
## Count
## 1 0.4643963
#Below will give a % of folks who are listed as having a "thyroid problem"
(Metals %>%
filter(thyroid_prob_D==1) %>%
summarise(Count = n()))/sum(!is.na(Metals$thyroid_prob_D))
## Count
## 1 0.44375
(Metals %>%
filter(emr_thyroid_problem==1) %>%
summarise(Count = n()))/sum(!is.na(Metals$emr_thyroid_problem))
## Count
## 1 0.3174603
Metals$age <- as.numeric(Metals$age)
knitr::kable(Metals %>%
summarise(mean=round(mean(age, na.rm=T), digits=1),
median=round(median(age, na.rm=T), digits=1),
min=round(min(age, na.rm=T), digits=1),
max=round(max(age, na.rm=T), digits=1),
NAs=sum(is.na(age))))
| mean | median | min | max | NAs |
|---|---|---|---|---|
| 48.4 | 50.4 | 18.5 | 89.9 | 12 |
knitr::kable(summary(round(Metals[c("emr_height_inches", "emr_weight", "emr_bmi", "height_total", "Weight", "bmi")]), digits=1), col.names = c("EMR Height (inches)", "EMR Weight (lbs)", "EMR BMI", "Measured Height (inches)", "Measured Weight (lbs)", "Measured BMI"))
| EMR Height (inches) | EMR Weight (lbs) | EMR BMI | Measured Height (inches) | Measured Weight (lbs) | Measured BMI | |
|---|---|---|---|---|---|---|
| Min. :52 | Min. : 92 | Min. :17 | Min. :54 | Min. : 97 | Min. :18 | |
| 1st Qu.:61 | 1st Qu.:155 | 1st Qu.:27 | 1st Qu.:62 | 1st Qu.:152 | 1st Qu.:26 | |
| Median :63 | Median :185 | Median :32 | Median :64 | Median :179 | Median :30 | |
| Mean :63 | Mean :190 | Mean :33 | Mean :64 | Mean :180 | Mean :31 | |
| 3rd Qu.:65 | 3rd Qu.:215 | 3rd Qu.:37 | 3rd Qu.:66 | 3rd Qu.:200 | 3rd Qu.:35 | |
| Max. :72 | Max. :360 | Max. :64 | Max. :80 | Max. :360 | Max. :56 | |
| NA’s :138 | NA’s :135 | NA’s :150 | NA’s :12 | NA’s :7 | NA’s :12 |
t.test(Metals$emr_bmi, Metals$bmi, paired = T)
##
## Paired t-test
##
## data: Metals$emr_bmi and Metals$bmi
## t = 5.3486, df = 172, p-value = 2.793e-07
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 0.7307578 1.5855707
## sample estimates:
## mean difference
## 1.158164
#Some data is missing from the EMRs for BMI. The BMIs between the the EMR and measurements are different in general, but also there are differences between the paired data.
summary(aov(Gender~as.factor(Site), data=Metals))
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Site) 2 2.72 1.3596 8.536 0.000244 ***
## Residuals 320 50.97 0.1593
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
summary(aov(age~as.factor(Site), data=Metals))
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Site) 2 4099 2049.5 8.998 0.000158 ***
## Residuals 315 71749 227.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 12 observations deleted due to missingness
summary(aov(Ethnicity~as.factor(Site), data=Metals))
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Site) 2 6.714 3.357 41.24 <2e-16 ***
## Residuals 320 26.048 0.081
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
summary(aov(Marital_status~as.factor(Site), data=Metals))
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Site) 2 31.0 15.484 6.433 0.00182 **
## Residuals 320 770.2 2.407
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
summary(aov(Education~as.factor(Site), data=Metals))
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Site) 2 138.2 69.10 38.74 8.61e-16 ***
## Residuals 320 570.9 1.78
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
pairwise.t.test(Metals$Gender, as.factor(Metals$Site), adj="bonf")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: Metals$Gender and as.factor(Metals$Site)
##
## 1 2
## 2 0.00022 -
## 3 0.00257 0.36700
##
## P value adjustment method: holm
pairwise.t.test(Metals$age, as.factor(Metals$Site), adj="bonf")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: Metals$age and as.factor(Metals$Site)
##
## 1 2
## 2 0.0414 -
## 3 0.1992 0.0001
##
## P value adjustment method: holm
pairwise.t.test(Metals$Ethnicity, as.factor(Metals$Site), adj="bonf")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: Metals$Ethnicity and as.factor(Metals$Site)
##
## 1 2
## 2 1 -
## 3 1.6e-11 2.2e-14
##
## P value adjustment method: holm
pairwise.t.test(Metals$Marital_status, as.factor(Metals$Site), adj="bonf")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: Metals$Marital_status and as.factor(Metals$Site)
##
## 1 2
## 2 0.0012 -
## 3 0.0740 0.0857
##
## P value adjustment method: holm
pairwise.t.test(Metals$Education, as.factor(Metals$Site), adj="bonf")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: Metals$Education and as.factor(Metals$Site)
##
## 1 2
## 2 1.2e-05 -
## 3 0.003 3.5e-16
##
## P value adjustment method: holm
Explore the distributional properties of (hair?) metals concentrations from Yuma, AZ. Also, use Rmd file to illustrate use of R and markdown for computationally reproducible research.
Standard and robust PCA yield generally similar results. There are enough differences, however, that the robust method would be my first choice for data summarization.
#setwd
setwd("~/Desktop/Research/Yuma/Yuma - R Project/")
## read data
##filename <- c("Yuma Metals Project - EMR 19Jun20.xlsx")
filename <- c("Yuma Metals Project_EMR_Editted 23Aug2021 mental health & ID removed.xlsx") ## new data set
tmp <- read_excel(filename, sheet=3)
dim(tmp) ## 330 222 ## same dimensions
## [1] 330 199
names(tmp)
## [1] "ID" "Mn55"
## [3] "Cu65" "Cd111"
## [5] "Hg202" "Pb208"
## [7] "U238" "Site"
## [9] "Date" "Gender"
## [11] "DOB_month" "DOB_day"
## [13] "DOB_year" "DOB"
## [15] "Ethnicity" "Race_ai_an"
## [17] "Race_asian" "Race_black"
## [19] "Race_pac_isl" "Race_white"
## [21] "Race_other" "Race_dk"
## [23] "Race_refuse" "Race_other_TEXT"
## [25] "Marital_status" "Num_in_household"
## [27] "Num_children" "Child_under_18"
## [29] "Education" "Education_TEXT"
## [31] "Birth_country" "Birth_country_TEXT"
## [33] "Armed_forces" "Army"
## [35] "Navy" "Marine_corp"
## [37] "Air_force" "Coast_guard"
## [39] "Other" "Armed_forces_dk"
## [41] "Armed_forces_refuse" "Armed_forces_TEXT"
## [43] "Work_for_pay" "Self_employed"
## [45] "Looking" "Laid_off"
## [47] "Retired" "Homemaker"
## [49] "Student" "Disabled"
## [51] "Maternity_sick_leave" "employ_other"
## [53] "employ_other_A" "employ_dk"
## [55] "employ_TEXT" "Occupation"
## [57] "Farmer" "Pesticides"
## [59] "Lived_in_Yuma" "Born_in_Yuma"
## [61] "Yuma_pregnant_mom" "Residence_type"
## [63] "Residence_TEXT" "Own_rent"
## [65] "Own_rent_TEXT" "Homeless"
## [67] "Water_home" "Water_home_TEXT"
## [69] "Water_cooking" "Water_cooking_TEXT"
## [71] "Water_coffee" "Water_coffee_TEXT"
## [73] "Water_work" "Water_work_TEXT"
## [75] "Health_care" "Health_care_source"
## [77] "Health_care_source_TEXT" "Healthcare_lastyear"
## [79] "Health_consult" "Health_consult_place"
## [81] "Health_consult_place_TEXT" "Routine_care"
## [83] "Routine_care_TEXT" "Doctor_time"
## [85] "ER_visits" "afford_rx_meds"
## [87] "afford_mh_care" "afford_dental_care"
## [89] "afford_eyeglasses" "afford_specialist"
## [91] "afford_follow_up" "Delay_telephone"
## [93] "Delay_appt" "Delay_wait"
## [95] "Delay_open" "Delay_transportation"
## [97] "Mental_health" "General_health"
## [99] "Health_compare" "Height_ft"
## [101] "Height_inch" "Weight"
## [103] "Overweight" "Dental_visit"
## [105] "Dental_health" "arthritis"
## [107] "gout" "chf"
## [109] "chd" "angnia"
## [111] "heart_attack" "stroke"
## [113] "emphysema" "thyroid_prob"
## [115] "hyperthyroid" "hypothyroid"
## [117] "thyroid_cancer" "goiter"
## [119] "weak_kidney" "cancer"
## [121] "pos" "undescended"
## [123] "ambiguous" "HTN"
## [125] "HTN_hx_meds" "HTN_current_med"
## [127] "Cholesterol" "Chol_med"
## [129] "Diabetes" "DM_age"
## [131] "DM_age_TEXT" "insulin"
## [133] "diabetic_pills" "Income"
## [135] "age" "thyroid_prob_D"
## [137] "Hyper_D" "Hypo_D"
## [139] "Thyroid_cancer_D" "Goiter_D"
## [141] "thyroid_conditions" "clinical"
## [143] "endo" "fT3"
## [145] "TSH" "fT4"
## [147] "TotalT4" "TotalT3"
## [149] "Cortisol" "latitude"
## [151] "longitude" "emr_height_inches"
## [153] "emr_weight" "emr_bmi"
## [155] "emr_thyroid_problem" "emr_hyperthyroidism"
## [157] "emr_hypothyroidism" "emr_thyroid_cancer"
## [159] "emr_goiter" "emr_cancer"
## [161] "emr_pos" "emr_diabetes"
## [163] "emr_obesity" "sc"
## [165] "emr_sud" "emr_asd"
## [167] "meds" "meds_name"
## [169] "thyroid_med" "free_t3_med"
## [171] "tsh_med" "total_t4_med"
## [173] "free_t4_med" "total_t3_med"
## [175] "cortisol_med" "Fe_57"
## [177] "As_75" "height_total"
## [179] "bmi" "age_group"
## [181] "Site_Collapsed" "hyper_recode"
## [183] "hypo_recode" "thy_cancer_recode"
## [185] "goiter_recode" "pos_recode"
## [187] "dm_recode" "emr_dm_recode"
## [189] "sud" "tobacco_history"
## [191] "number_years" "pack_per_day"
## [193] "quit_date" "tremor"
## [195] "FAC1_1" "FAC2_1"
## [197] "FAC3_1" "FAC4_1"
## [199] "FAC5_1"
## [1] "ID" "Mn55"
## [3] "Cu65" "Cd111"
## [5] "Hg202" "Pb206"
## [7] "Pb207" "PB208"
## [9] "U238" "Site"
## [11] "Date" "Gender"
## head(tmp)
## keep only ID, metals and Site, cast to data frame
metals <- data.frame(tmp[,c(1:8)])
names(metals) ## omitted Pb206 and Pb207
## [1] "ID" "Mn55" "Cu65" "Cd111" "Hg202" "Pb208" "U238" "Site"
## metals check format
The summary below shows there are 123 NAs for nearly all of the metals. Hg202 is the exception with 212 NAs.
Data exploration shows that the NA pattern is consistent across the metals. If one is missing, all are missing.
There are fewer measures at site 1. More missing values at site 1 (51/78) and at site 3 (48/126) compared with site 2 (24/126)
## Initial summary
summary(metals)
## ID Mn55 Cu65 Cd111
## Length:330 Min. : 0.06 Min. : 0.1626 Min. :0.0495
## Class :character 1st Qu.: 0.36 1st Qu.: 10.4350 1st Qu.:0.0495
## Mode :character Median : 0.75 Median : 20.7800 Median :0.0495
## Mean : 3.68 Mean : 76.2073 Mean :0.1292
## 3rd Qu.: 1.72 3rd Qu.: 49.9650 3rd Qu.:0.0495
## Max. :103.32 Max. :1704.1300 Max. :3.1000
## NA's :123 NA's :123 NA's :123
## Hg202 Pb208 U238 Site
## Min. : 0.00707 Min. : 0.020 Min. : 0.03536 Min. :1.000
## 1st Qu.: 0.13050 1st Qu.: 0.180 1st Qu.: 0.03536 1st Qu.:2.000
## Median : 0.27300 Median : 0.440 Median : 0.08000 Median :2.000
## Mean : 1.59401 Mean : 1.825 Mean : 0.36315 Mean :2.145
## 3rd Qu.: 0.48325 3rd Qu.: 1.160 3rd Qu.: 0.21500 3rd Qu.:3.000
## Max. :60.17500 Max. :76.450 Max. :21.08000 Max. :3.000
## NA's :212 NA's :123 NA's :123
## Number of Observations per Site
table(metals$Site) ## 78 126 and 126. Fewer at site 1
##
## 1 2 3
## 78 126 126
## Number of values > BDL by Site
table(metals$Site, !is.na(metals[,2])) ## Observation indicator by site
##
## FALSE TRUE
## 1 51 27
## 2 24 102
## 3 48 78
The figure below shows metals concentrations on log10 scale. NA values are omitted.
The shape of the Pb distributions are nearly identical (omitted 206 and 207). On the whole, these distributions are not as bad as I feared. All are stil right-skewed after transformation, but they do not look pathologic.
## change from wide to long format for plotting
metals %>% pivot_longer(-c(ID, Site), names_to="Metal", values_to="Concentration") %>%
ggplot(aes(x=Concentration)) + geom_density() +
scale_x_log10() + facet_wrap(~ Metal) +
ggtitle("Distributions of Metals Concentrations (log10 transform)")
## Warning: Removed 827 rows containing non-finite values (stat_density).
Metals concentrations - log10 scale
Values labeled as NA across (nearly all metals) are not measured. For each observation with missing metals, omit the observation. In addition there are 89 observations with missing Hg values. We may impute these missing values based on site and other observed metals
Note - Other BDL values are replaced with 1/2 the limit of detection.
names(metals)
## [1] "ID" "Mn55" "Cu65" "Cd111" "Hg202" "Pb208" "U238" "Site"
metals.obs <- metals[!is.na(metals[,2]), ]
## for Hg, replace NA and zero with 0.029/sqrt(2)
##sort(metals.obs[,"Hg202"])
## Hg.subs <- 0.029 /sqrt(2)
metals.complete.cases <- metals.obs %>% filter(!is.na(Cu65)) %>%
filter(!is.na(Hg202)) ## ID R1108 also missing Hg, other metals observed
dim(metals.complete.cases) ## 111 8 wow - small data set
## [1] 111 8
summary(metals.complete.cases)
## ID Mn55 Cu65 Cd111
## Length:111 Min. : 0.070 Min. : 0.1626 Min. :0.04950
## Class :character 1st Qu.: 0.355 1st Qu.: 14.1900 1st Qu.:0.04950
## Mode :character Median : 0.810 Median : 24.7600 Median :0.04950
## Mean : 4.727 Mean : 94.9425 Mean :0.13170
## 3rd Qu.: 2.175 3rd Qu.: 73.0850 3rd Qu.:0.06475
## Max. :103.320 Max. :1088.6500 Max. :2.87000
## Hg202 Pb208 U238 Site
## Min. : 0.00707 Min. : 0.020 Min. :0.03535 Min. :1.000
## 1st Qu.: 0.14550 1st Qu.: 0.185 1st Qu.:0.03535 1st Qu.:2.000
## Median : 0.29100 Median : 0.450 Median :0.08000 Median :2.000
## Mean : 1.68130 Mean : 2.230 Mean :0.34223 Mean :2.216
## 3rd Qu.: 0.49500 3rd Qu.: 1.225 3rd Qu.:0.24500 3rd Qu.:3.000
## Max. :60.17500 Max. :76.450 Max. :9.24000 Max. :3.000
metals.impute.Hg <- metals.obs %>% filter(!is.na(Cu65)) %>%
filter(is.na(Hg202))
metals.complete.log10 <- cbind(ID=metals.complete.cases[,"ID"],
log10(metals.complete.cases[,2:7]),
Site=metals.complete.cases[,"Site"])
summary(metals.complete.log10)
## ID Mn55 Cu65 Cd111
## Length:111 Min. :-1.15490 Min. :-0.7888 Min. :-1.3054
## Class :character 1st Qu.:-0.45016 1st Qu.: 1.1520 1st Qu.:-1.3054
## Mode :character Median :-0.09151 Median : 1.3938 Median :-1.3054
## Mean : 0.05038 Mean : 1.5370 Mean :-1.1448
## 3rd Qu.: 0.33745 3rd Qu.: 1.8638 3rd Qu.:-1.2012
## Max. : 2.01418 Max. : 3.0369 Max. : 0.4579
## Hg202 Pb208 U238 Site
## Min. :-2.1505 Min. :-1.69897 Min. :-1.4515 Min. :1.000
## 1st Qu.:-0.8373 1st Qu.:-0.73299 1st Qu.:-1.4515 1st Qu.:2.000
## Median :-0.5361 Median :-0.34679 Median :-1.0969 Median :2.000
## Mean :-0.5300 Mean :-0.31046 Mean :-0.9353 Mean :2.216
## 3rd Qu.:-0.3055 3rd Qu.: 0.08784 3rd Qu.:-0.6109 3rd Qu.:3.000
## Max. : 1.7794 Max. : 1.88338 Max. : 0.9657 Max. :3.000
round(cor(metals.complete.log10[,2:7]), 2)
## Mn55 Cu65 Cd111 Hg202 Pb208 U238
## Mn55 1.00 0.48 0.46 0.07 0.55 0.15
## Cu65 0.48 1.00 0.30 0.15 0.59 0.20
## Cd111 0.46 0.30 1.00 0.15 0.60 0.20
## Hg202 0.07 0.15 0.15 1.00 0.21 0.09
## Pb208 0.55 0.59 0.60 0.21 1.00 0.26
## U238 0.15 0.20 0.20 0.09 0.26 1.00
metals.impute.log10 <- cbind(ID=metals.impute.Hg[,"ID"],
log10(metals.impute.Hg[,2:7]),
Site=metals.impute.Hg[,"Site"])
summary(metals.impute.log10)
## ID Mn55 Cu65 Cd111
## Length:95 Min. :-1.22185 Min. :-0.7888 Min. :-1.3054
## Class :character 1st Qu.:-0.43807 1st Qu.: 0.9668 1st Qu.:-1.3054
## Mode :character Median :-0.16749 Median : 1.1884 Median :-1.3054
## Mean :-0.09223 Mean : 1.1755 Mean :-1.1551
## 3rd Qu.: 0.09320 3rd Qu.: 1.4984 3rd Qu.:-1.3054
## Max. : 1.56785 Max. : 3.2315 Max. : 0.4914
##
## Hg202 Pb208 U238 Site
## Min. : NA Min. :-1.69897 Min. :-1.4515 Min. :1.000
## 1st Qu.: NA 1st Qu.:-0.76955 1st Qu.:-1.4515 1st Qu.:2.000
## Median : NA Median :-0.38722 Median :-1.1549 Median :2.000
## Mean :NaN Mean :-0.33314 Mean :-1.0172 Mean :2.284
## 3rd Qu.: NA 3rd Qu.:-0.02228 3rd Qu.:-0.7447 3rd Qu.:3.000
## Max. : NA Max. : 1.30168 Max. : 1.3239 Max. :3.000
## NA's :95
metals.impute.Hg
## ID Mn55 Cu65 Cd111 Hg202 Pb208 U238 Site
## 1 R1005 0.39 59.8300000 0.04949747 NA 0.36 0.03535534 2
## 2 R1001 1.43 33.5000000 0.04949747 NA 0.17 0.07000000 2
## 3 C1005 17.88 179.6800000 0.28000000 NA 20.03 0.14000000 1
## 4 R1011 0.68 44.3900000 0.04949747 NA 0.27 0.06000000 2
## 5 R1009 12.87 13.6600000 0.18000000 NA 6.51 0.25000000 2
## 6 C1013 0.82 9.2600000 0.04949747 NA 0.30 0.03535534 1
## 7 C1014 0.51 32.0500000 0.04949747 NA 0.26 0.03535534 1
## 8 R1018 0.40 18.8100000 0.04949747 NA 0.06 0.03535534 2
## 9 R1019 0.96 9.6300000 0.13000000 NA 2.87 0.03535534 2
## 10 C1017 0.28 13.9100000 0.04949747 NA 0.11 0.03535534 1
## 11 C1022 1.20 22.6400000 0.04949747 NA 0.95 0.15000000 1
## 12 C1026 0.17 6.2200000 0.04949747 NA 1.72 0.22000000 1
## 13 C1018 0.16 22.8200000 0.04949747 NA 0.08 0.23000000 1
## 14 R1021 1.28 0.1626346 0.04949747 NA 0.68 0.03535534 2
## 15 R1022 1.11 97.6600000 0.04949747 NA 0.62 0.64000000 2
## 16 R1025 3.33 29.4200000 0.11000000 NA 5.46 0.37000000 2
## 17 R1026 2.03 21.9900000 0.42000000 NA 0.95 0.25000000 2
## 18 R1032 0.54 32.1600000 0.04949747 NA 0.81 1.50000000 2
## 19 R1043 1.05 13.6900000 0.04949747 NA 0.53 0.06000000 2
## 20 R1050 0.75 20.7800000 0.04949747 NA 0.04 0.03535534 2
## 21 R1049 0.18 7.0400000 0.04949747 NA 0.48 0.03535534 2
## 22 R1051 11.08 15.6500000 0.04949747 NA 0.14 0.05000000 2
## 23 R1062 0.57 0.1626346 0.04949747 NA 0.02 0.03535534 2
## 24 R1035 0.47 59.0400000 0.04949747 NA 0.38 0.03535534 2
## 25 R1059 0.78 23.8800000 0.04949747 NA 1.25 0.03535534 2
## 26 C1035 0.46 9.8000000 0.04949747 NA 0.73 0.03535534 1
## 27 C1038 0.71 9.4900000 0.17000000 NA 0.88 0.03535534 1
## 28 C1037 0.81 3.3000000 0.09000000 NA 1.69 0.03535534 1
## 29 C1036 1.71 27.6000000 0.04949747 NA 0.06 0.05000000 1
## 30 R1038 0.50 19.5300000 0.12000000 NA 0.41 0.13000000 2
## 31 R1070 0.49 32.3300000 0.04949747 NA 0.12 0.03535534 2
## 32 R1040 0.81 27.0800000 0.04949747 NA 0.68 0.03535534 2
## 33 R1071 6.16 12.0800000 0.04949747 NA 1.15 0.22000000 2
## 34 R1074 0.61 4.6500000 0.04949747 NA 0.40 0.38000000 2
## 35 R1084 1.00 43.8200000 0.04949747 NA 0.16 0.03535534 2
## 36 R1082 21.35 131.1400000 0.04949747 NA 1.17 0.03535534 2
## 37 R1086 5.91 149.6300000 0.14000000 NA 1.52 0.29000000 2
## 38 R1093 0.35 10.1400000 0.04949747 NA 0.41 0.03535534 2
## 39 R1102 0.35 21.7900000 0.04949747 NA 0.10 0.03535534 2
## 40 R1103 0.30 47.7300000 0.04949747 NA 0.58 0.03535534 2
## 41 C1057 2.28 0.1626346 0.04949747 NA 3.19 0.03535534 1
## 42 R1101 0.54 62.4500000 0.04949747 NA 0.78 0.11000000 2
## 43 R1107 0.76 19.5300000 0.04949747 NA 0.61 0.05000000 2
## 44 R1109 6.77 417.0400000 0.30000000 NA 13.79 1.10000000 2
## 45 R1112 0.55 13.0900000 0.04949747 NA 1.20 0.17000000 2
## 46 R1110 1.46 9.4500000 0.04949747 NA 1.75 0.20000000 2
## 47 Y1019 0.15 6.0300000 0.04949747 NA 0.06 0.03535534 3
## 48 Y1017 1.02 16.9600000 0.04949747 NA 1.48 0.06000000 3
## 49 Y1018 0.38 8.7900000 0.04949747 NA 0.38 0.42000000 3
## 50 Y1020 0.24 30.1000000 0.04949747 NA 0.14 0.10000000 3
## 51 Y1021 0.34 21.7100000 0.04949747 NA 0.21 0.92000000 3
## 52 Y1024 0.38 12.9100000 0.04949747 NA 0.13 0.11000000 3
## 53 Y1023 0.41 10.1000000 0.04949747 NA 0.34 0.18000000 3
## 54 R1116 12.11 1704.1300000 0.34000000 NA 16.40 0.76000000 2
## 55 Y1026 0.41 7.6400000 0.04949747 NA 0.51 0.13000000 3
## 56 Y1028 1.06 74.7500000 0.04949747 NA 0.31 0.18000000 3
## 57 R1121 1.18 51.0100000 0.12000000 NA 1.41 0.14000000 2
## 58 R1122 0.61 244.5900000 0.11000000 NA 0.13 0.16000000 2
## 59 C1046 5.59 30.9600000 0.04949747 NA 0.67 0.03535534 1
## 60 R1123 0.75 11.3400000 0.04949747 NA 0.29 0.31000000 2
## 61 R1125 0.66 31.2100000 1.00000000 NA 0.86 0.03535534 2
## 62 Y1036 0.06 3.5500000 0.04949747 NA 0.21 0.26000000 3
## 63 Y1043 0.08 11.0700000 0.04949747 NA 0.40 0.15000000 3
## 64 Y1047 31.39 18.9300000 0.14000000 NA 1.86 1.00000000 3
## 65 Y1065 0.46 28.6700000 0.04949747 NA 0.25 0.03535534 3
## 66 Y1048 0.23 30.5000000 0.47000000 NA 0.31 0.03535534 3
## 67 Y1066 0.14 14.1600000 0.04949747 NA 0.15 0.07000000 3
## 68 Y1063 0.21 9.2700000 0.04949747 NA 0.51 0.09000000 3
## 69 Y1045 0.55 9.9700000 0.04949747 NA 0.20 0.14000000 3
## 70 Y1060 0.14 10.8600000 0.04949747 NA 0.17 0.70000000 3
## 71 Y1069 0.31 19.8400000 0.04949747 NA 0.10 0.03535534 3
## 72 Y1075 0.86 185.8700000 3.10000000 NA 0.86 0.03535534 3
## 73 Y1070 0.93 0.1626346 0.04949747 NA 1.72 0.13000000 3
## 74 Y1076 0.28 6.6900000 0.04949747 NA 0.12 0.03535534 3
## 75 Y1077 0.68 15.4300000 0.04949747 NA 0.19 0.03535534 3
## 76 Y1072 0.13 5.9300000 0.04949747 NA 0.06 0.09000000 3
## 77 Y1073 0.17 5.9000000 0.04949747 NA 0.16 0.09000000 3
## 78 Y1068 0.67 9.8500000 0.04949747 NA 0.16 0.34000000 3
## 79 Y1085 1.72 6.0000000 0.10000000 NA 0.85 0.03535534 3
## 80 Y1088 0.47 6.7400000 0.04949747 NA 0.17 0.03535534 3
## 81 Y1086 0.69 3.1100000 0.04949747 NA 0.18 0.03535534 3
## 82 Y1090 0.70 1.5200000 0.04949747 NA 2.06 0.03535534 3
## 83 Y1087 1.08 31.8000000 0.04949747 NA 0.83 0.12000000 3
## 84 Y1096 0.31 14.9800000 0.04949747 NA 0.39 0.03535534 3
## 85 Y1099 36.97 18.7300000 0.33000000 NA 0.86 0.03535534 3
## 86 Y1098 0.77 48.4900000 0.58000000 NA 2.92 0.16000000 3
## 87 Y1097 0.79 41.7600000 0.04949747 NA 0.11 0.25000000 3
## 88 Y1064 4.98 0.1626346 0.04949747 NA 10.17 21.08000000 3
## 89 Y1121 1.42 12.5500000 0.04949747 NA 0.09 0.03535534 3
## 90 Y1119 0.33 6.6000000 0.04949747 NA 0.86 0.17000000 3
## 91 Y1125 2.15 0.1626346 0.04949747 NA 1.59 0.03535534 3
## 92 Y1127 2.32 10.3300000 0.04949747 NA 0.34 0.16000000 3
## 93 Y1126 0.29 4.9300000 0.04949747 NA 0.25 0.39000000 3
## 94 C1104 3.67 506.8900000 0.18000000 NA 0.87 0.03535534 1
## 95 Y1008 0.14 12.6100000 0.04949747 NA 0.03 0.15000000 2
### fit regression line for Hg
Hg.lm <- lm(Hg202 ~ as.factor(Site) + Mn55 + Cu65 + Cd111 + Pb208 + U238,
data=metals.complete.log10)
qplot(metals.complete.log10[,"Hg202"], predict(Hg.lm))
qplot( predict(Hg.lm), resid(Hg.lm))
Hg.imputed <- predict(Hg.lm, newdata=metals.impute.log10)
metals.fillin.log10 <- metals.impute.log10
metals.fillin.log10[,"Hg202"] <- Hg.imputed
summary(metals.fillin.log10)
## ID Mn55 Cu65 Cd111
## Length:95 Min. :-1.22185 Min. :-0.7888 Min. :-1.3054
## Class :character 1st Qu.:-0.43807 1st Qu.: 0.9668 1st Qu.:-1.3054
## Mode :character Median :-0.16749 Median : 1.1884 Median :-1.3054
## Mean :-0.09223 Mean : 1.1755 Mean :-1.1551
## 3rd Qu.: 0.09320 3rd Qu.: 1.4984 3rd Qu.:-1.3054
## Max. : 1.56785 Max. : 3.2315 Max. : 0.4914
## Hg202 Pb208 U238 Site
## Min. :-0.9754 Min. :-1.69897 Min. :-1.4515 Min. :1.000
## 1st Qu.:-0.6857 1st Qu.:-0.76955 1st Qu.:-1.4515 1st Qu.:2.000
## Median :-0.5787 Median :-0.38722 Median :-1.1549 Median :2.000
## Mean :-0.5607 Mean :-0.33314 Mean :-1.0172 Mean :2.284
## 3rd Qu.:-0.4261 3rd Qu.:-0.02228 3rd Qu.:-0.7447 3rd Qu.:3.000
## Max. :-0.1436 Max. : 1.30168 Max. : 1.3239 Max. :3.000
## combine complete cases with Hg imputed cases
metals.allobs.log10 <- rbind(metals.complete.log10, metals.fillin.log10)
## dim(metals.allobs.log10) ## 206 8 ## OK obs with missing Cu65 is omitted.
## dim(metals.obs) ## 207 8
## check summary
summary(metals.allobs.log10)
## ID Mn55 Cu65 Cd111
## Length:206 Min. :-1.22185 Min. :-0.7888 Min. :-1.3054
## Class :character 1st Qu.:-0.44990 1st Qu.: 1.0261 1st Qu.:-1.3054
## Mode :character Median :-0.12494 Median : 1.3212 Median :-1.3054
## Mean :-0.01539 Mean : 1.3703 Mean :-1.1495
## 3rd Qu.: 0.23553 3rd Qu.: 1.7031 3rd Qu.:-1.3054
## Max. : 2.01418 Max. : 3.2315 Max. : 0.4914
## Hg202 Pb208 U238 Site
## Min. :-2.1505 Min. :-1.69897 Min. :-1.4515 Min. :1.000
## 1st Qu.:-0.7322 1st Qu.:-0.74473 1st Qu.:-1.4515 1st Qu.:2.000
## Median :-0.5680 Median :-0.36154 Median :-1.0969 Median :2.000
## Mean :-0.5442 Mean :-0.32092 Mean :-0.9731 Mean :2.248
## 3rd Qu.:-0.3821 3rd Qu.: 0.06446 3rd Qu.:-0.6626 3rd Qu.:3.000
## Max. : 1.7794 Max. : 1.88338 Max. : 1.3239 Max. :3.000
New figure with Hg values imputed using regression on other metals
## change from wide to long format for plotting
metals.allobs.log10 %>% pivot_longer(-c(ID, Site), names_to="Metal", values_to="Concentration") %>%
ggplot(aes(x=Concentration)) + geom_density() +
facet_wrap(~ Metal) +
ggtitle("Distributions of Metals Concentrations (missin Hg values imputed, log10 transform)")
Metals concentrations - log10 scale
Now we move on to Robust PCA. We use the method described by the following papers.
Hubert, M., Rousseeuw, P. J., and Vanden Branden, K. (2005), “ROBPCA: A New Approach to Robust Principal Component Analysis,” Technometrics, 47, 64–79.
Engelen, S., Hubert, M. and Vanden Branden, K. (2005), “A Comparison of Three Procedures for Robust PCA in High Dimensions”, Austrian Journal of Statistics, 34, 117–126.
Hubert, M., Rousseeuw, P. J., and Verdonck, T. (2009), “Robust PCA for Skewed Data and Its Outlier Map,” Computational Statistics & Data Analysis, 53, 2264–2274.
This is implemented in the R package \(rospca\)
The plot below shows results from the robust PCA. The “Score distance” describes the leverage that an indvidual point has in determining the fitted PCA surface. The “Orthogonal distance” shows the distance to the fitted surface.
Thus, points 31, 48, 85, and 200 heaviy influence the slope of the PCA surface (the loadings). Conversely, point 143 is a large distance from the mass of points, but has little influence on the loadings.
The eigenvalues (bottom of the box) show that of the variation is captured in PC1 (41%) and PC2 (20%).
##install.packages("rospca", repos="http://cran.r-project.org")
##dim(metals.adjusted)
##metals.log10 <- as.matrix(log10(metals.adjusted[, 2:7]))
colnames(metals.allobs.log10)
## [1] "ID" "Mn55" "Cu65" "Cd111" "Hg202" "Pb208" "U238" "Site"
## round(cor(metals.log10), 2) ## check pairwise correlation
### == original full PCA
metals.pca <- robpca(metals.allobs.log10[,2:7], k=5, alpha=.7, ndir="all", skew=T)
## Eigenvalues show variation in different components
cat("Total Variance in PC1 and PC2\n")
## Total Variance in PC1 and PC2
round(metals.pca[[2]]/sum(metals.pca[[2]]), 3) ## variation in PC's is as expected across components
## PC1 PC2 PC3 PC4 PC5
## 0.470 0.220 0.134 0.120 0.056
diagPlot(metals.pca)
The eigenvalues show that PC1 and PC2 exhibit the majority of the variance, PC 3-5 are smaller. In general, the diagnostic plot suggests that robust PCA is warranted
The loadings are shown below.
round(metals.pca$loadings, 3)
## PC1 PC2 PC3 PC4 PC5
## Mn55 0.581 0.064 0.601 -0.533 0.083
## Cu65 0.406 0.269 -0.775 -0.394 -0.035
## Cd111 0.197 0.001 -0.008 0.051 0.007
## Hg202 0.114 -0.036 -0.086 0.181 0.972
## Pb208 0.651 -0.002 0.005 0.708 -0.215
## U238 0.147 -0.960 -0.174 -0.154 -0.041
names(metals.allobs.log10[,2:7])
## [1] "Mn55" "Cu65" "Cd111" "Hg202" "Pb208" "U238"
##plot(metals.pca$loadings, pch=rownames(metals.pca$loadings))
There are a few of points to notice about the loadings
All components of PC1 have the same sign. As before, this suggests a “common” effect to all measurements. For example, all metals tend to be “high” or “low” together. May be caused either by global high-low exposure of inviduals, or by mass spectrometry measurement issues.
PC1 has highest loading for Pb208, Mn55, and Cu65
For PC2, U238 is the biggest driver.
Finally, we look at PCA scores by site. The second figure below shows clearly that site 2 has slightly more variation than site 3. But not bad. It appears that site 3 may be slightly greater on PC2 than is site 2.
##names(metals.allobs.log10)
metals.pca.df <- data.frame(Site=factor(metals.allobs.log10$Site), metals.pca$scores)
qplot(x=PC1, y=PC2, data=metals.pca.df, col=Site,
title="Robust PCA Scores by Site")
## Warning: Ignoring unknown parameters: title
qplot(x=PC1, y=PC2, data=metals.pca.df) + ggtitle("Robust PCA Scores by Site") +
facet_wrap(~ Site)
## dim(metals.pca$scores)
## dim(metals.allobs.log10)
## New df on 2021.09.06
metals.robpca <- cbind(metals.allobs.log10, metals.pca$scores)
## dim(metals.robpca)
### about here
qplot(x=PC1, y=Mn55, data=metals.robpca, col=as.factor(Site))
qplot(x=PC1, y=Pb208, data=metals.robpca, col=as.factor(Site))
qplot(x=PC2, y=U238, data=metals.robpca, col=as.factor(Site))
write.csv(metals.robpca, file="metals_Impute_log10_PCA_scores.csv", quote=F, row.names=F)
pca_No_Hg <- robpca(metals.allobs.log10[,c(2:4, 6, 7)], k=5, alpha=.7, ndir="all",
skew=T)
round(pca_No_Hg[[2]]/sum(pca_No_Hg[[2]]), 3) ## .48 .19 .11
## PC1 PC2 PC3 PC4 PC5
## 0.549 0.216 0.124 0.097 0.014
round(pca_No_Hg$loadings, 2)
## PC1 PC2 PC3 PC4 PC5
## Mn55 0.61 0.17 0.02 0.77 0.10
## Cu65 0.47 0.32 0.68 -0.47 0.03
## Cd111 0.14 -0.05 -0.01 0.03 -0.99
## Pb208 0.56 0.03 -0.70 -0.44 0.07
## U238 0.28 -0.93 0.21 -0.04 0.08
## dim(pca_No_Hg$scores) ## 206 5
metals.noHg.robpca <- cbind(metals.allobs.log10, pca_No_Hg$scores)
write.csv(metals.noHg.robpca, file="metals_NoHg_log10_PCA_scores.csv", quote=F, row.names=F)
##system("pwd")
rmarkdown::render("yuma-metals-20210821.Rmd", 'html_document')
Based on meeting with Drs. Dean Billheimer and Frank von Hippel on 9/2/2022, this general analysis will explore how PC1 and PC2 relate to the health outcome data from the survey conducted in Yuma, AZ in 2018.
\(Note:\) The following scripts must be run prior to completing this analysis: Data_Clean.Rmd, Yuma_Metals_Analysis.Rmd, Demographics.Rmd, and yuma-metals-20210821.Rmd. This will provide the accurate dataframes used in the following code.
#Pb isotopes were removed because they were all heavily collinear.
##Perhaps so much so that the isotopes were calculated based on one reference one.
##Kept Pb208.
#The classic vs. robust PCAs were different enough that Dean felt like keeping the robust version was worth it. Even after logging the data, they are still skewed.
#PC1 is all positively correlated, which shows that it is an analysis of overall exposure.
#PC2 shows which ones are correlated in the same +ve or -ve directions.
These counts include only those who selected “yes” as cases and only those who explicitly selected “no” as controls. Others who chose options such as “don’t know” or “prefer not to answer” or those who skipped the question were removed from these counts.
#Numbers for each outcome
Case_Count <- Metals %>%
summarise("Arthritis"=sum(arthritis==1, na.rm = T),
"Cancer"=sum(cancer==1, na.rm = T),
"Cancer - EMR"=sum(emr_cancer==1, na.rm = T),
"Goiter"=sum(goiter_recode==1, na.rm = T),
"Goiter - EMR"=sum(emr_goiter==1, na.rm = T),
"Thyroid Disease"=sum(thyroid_prob==1, na.rm = T),
"Thyroid Disease - EMR"=sum(emr_thyroid_problem==1, na.rm = T),
"Hypothyroidism"=sum(hypo_recode==1, na.rm = T),
"Hypothyroidism - EMR"= sum(emr_hypothyroidism==1, na.rm = T),
"Hyperthyroidism"=sum(hyper_recode==1, na.rm = T),
"Hyperthyroidism - EMR"= sum(emr_hyperthyroidism==1, na.rm = T),
"Thyroid Cancer"=sum(thy_cancer_recode==1, na.rm = T),
"Thyroid Cancer - EMR"=sum(emr_thyroid_cancer==1, na.rm = T),
"Diabetes Mellitus"=sum(dm_recode==1, na.rm = T),
"Diabetes Mellitus - EMR"=sum(emr_dm_recode==1, na.rm = T),
"Gout"=sum(gout==1, na.rm = T),
"Congenital Heart Disease (CHD)"=sum(chd==1, na.rm = T),
"Weak Kidney"=sum(weak_kidney==1, na.rm = T),
"Hypertension"=sum(HTN==1, na.rm = T),
"Stroke"=sum(stroke==1, na.rm = T),
"Angina"=sum(angnia==1, na.rm = T),
"Congestive Heart Failure (CHF)"=sum(chf==1, na.rm = T),
"Heart Attack"=sum(heart_attack==1, na.rm = T),
"Cholesterol"=sum(Cholesterol==1, na.rm = T),
"PCOS"=sum(pos_recode==1, na.rm = T),
"PCOS - EMR"=sum(emr_pos==1, na.rm = T),
"Undescended Testes"=sum(undescended==1, na.rm = T),
"Emphysema"=sum(emphysema==1, na.rm = T),
"Autism Spectrum Disorder - EMR"=sum(emr_asd==1, na.rm = T),
"Overweight"=sum(Overweight==1, na.rm = T),
"Obesity - EMR"=sum(emr_obesity==1, na.rm = T),
"Dental Health"=sum(Dental_health==4|Dental_health==5, na.rm = T))
Case_Count <- t(Case_Count)
colnames(Case_Count) <- c("Cases")
Control_Count <- Metals %>%
summarise("Arthritis"=sum(arthritis==2, na.rm = T),
"Cancer"=sum(cancer==2, na.rm = T),
"Cancer - EMR"=sum(emr_cancer==0, na.rm = T),
"Goiter"=sum(goiter_recode==0, na.rm = T),
"Goiter - EMR"=sum(emr_goiter==0, na.rm = T),
"Thyroid Disease"=sum(thyroid_prob==2, na.rm = T),
"Thyroid Disease - EMR"=sum(emr_thyroid_problem==0, na.rm = T),
"Hypothyroidism"=sum(hypo_recode==0, na.rm = T),
"Hypothyroidism - EMR"= sum(emr_hypothyroidism==0, na.rm = T),
"Hyperthyroidism"=sum(hyper_recode==0, na.rm = T),
"Hyperthyroidism - EMR"= sum(emr_hyperthyroidism==0, na.rm = T),
"Thyroid Cancer"=sum(thy_cancer_recode==0, na.rm = T),
"Thyroid Cancer - EMR"=sum(emr_thyroid_cancer==0, na.rm = T),
"Diabetes Mellitus"=sum(dm_recode==0, na.rm = T),
"Diabetes Mellitus - EMR"=sum(emr_dm_recode==0, na.rm = T),
"Gout"=sum(gout==2, na.rm = T),
"Congenital Heart Disease (CHD)"=sum(chd==2, na.rm = T),
"Weak Kidney"=sum(weak_kidney==2, na.rm = T),
"Hypertension"=sum(HTN==2, na.rm = T),
"Stroke"=sum(stroke==2, na.rm = T),
"Angina"=sum(angnia==2, na.rm = T),
"Congestive Heart Failure (CHF)"=sum(chf==2, na.rm = T),
"Heart Attack"=sum(heart_attack==2, na.rm = T),
"Cholesterol"=sum(Cholesterol==2, na.rm = T),
"PCOS"=sum(pos_recode==0, na.rm = T),
"PCOS - EMR"=sum(emr_pos==0, na.rm = T),
"Undescended Testes"=sum(undescended==2, na.rm = T),
"Emphysema"=sum(emphysema==2, na.rm = T),
"Autism Spectrum Disorder - EMR"=sum(emr_asd==0, na.rm = T),
"Overweight"=sum(Overweight==2, na.rm = T),
"Obesity - EMR"=sum(emr_obesity==0, na.rm = T),
"Dental Health"=sum((Dental_health==1|Dental_health==2|Dental_health==3), na.rm = T))
Control_Count <- t(Control_Count)
colnames(Control_Count) <- c("Controls")
Case_Control <- cbind(Case_Count, Control_Count)
Case_Control <- as.data.frame(Case_Control)
Case_Control$Total <- Case_Control$Cases+Case_Control$Controls
kable(Case_Control)
| Cases | Controls | Total | |
|---|---|---|---|
| Arthritis | 70 | 243 | 313 |
| Cancer | 23 | 293 | 316 |
| Cancer - EMR | 27 | 225 | 252 |
| Goiter | 9 | 298 | 307 |
| Goiter - EMR | 5 | 247 | 252 |
| Thyroid Disease | 142 | 176 | 318 |
| Thyroid Disease - EMR | 80 | 172 | 252 |
| Hypothyroidism | 92 | 201 | 293 |
| Hypothyroidism - EMR | 116 | 136 | 252 |
| Hyperthyroidism | 26 | 265 | 291 |
| Hyperthyroidism - EMR | 5 | 247 | 252 |
| Thyroid Cancer | 6 | 305 | 311 |
| Thyroid Cancer - EMR | 5 | 247 | 252 |
| Diabetes Mellitus | 44 | 137 | 181 |
| Diabetes Mellitus - EMR | 54 | 198 | 252 |
| Gout | 14 | 298 | 312 |
| Congenital Heart Disease (CHD) | 6 | 309 | 315 |
| Weak Kidney | 14 | 302 | 316 |
| Hypertension | 106 | 213 | 319 |
| Stroke | 7 | 309 | 316 |
| Angina | 8 | 307 | 315 |
| Congestive Heart Failure (CHF) | 7 | 307 | 314 |
| Heart Attack | 10 | 307 | 317 |
| Cholesterol | 133 | 182 | 315 |
| PCOS | 17 | 299 | 316 |
| PCOS - EMR | 2 | 250 | 252 |
| Undescended Testes | 2 | 307 | 309 |
| Emphysema | 4 | 311 | 315 |
| Autism Spectrum Disorder - EMR | 0 | 252 | 252 |
| Overweight | 165 | 157 | 322 |
| Obesity - EMR | 78 | 173 | 251 |
| Dental Health | 159 | 158 | 317 |
Note from Frank on 9/3/2022:
For the metals analysis, the health outcomes don’t look to me like outcomes likely to be related to metals exposure. It’s worthwhile to graph them to see if anything comes out, but I don’t think it will.
A couple of caveats: Research from laboratory animal studies show relationships between contaminant exposures and autism. I don’t think we have a big enough sample size to detect this, but it’s worth looking at.
I think you should compare our PC scores to BMI. We measured BMI so that should be available for every participant. Many contaminants are obesogenic.
Can we pull out the motor dysfunction health outcomes from our health survey and electronic medical records? I’m thinking of diseases such as Parkinson’s or anything else with a motor deficit. These can be caused by high exposure to metals.
colnames(metals.robpca) <- c("ID", "Ln_Mn55", "Ln_Cu65", "Ln_Cd111", "Ln_Hg202", "Ln_Pb208", "Ln_U238", "Site", "PC1", "PC2", "PC3", "PC4", "PC5")
Metals <- merge(Metals, metals.robpca, by=c("ID", "Site"))
colnames(Metals) #will need to check to see if any of the col names need to change (e.g. instead of PC1 it is listed as PC1.x or PC1.y).
## [1] "ID" "Site"
## [3] "Mn55" "Cu65"
## [5] "Cd111" "Hg202"
## [7] "Pb208" "U238"
## [9] "Gender" "DOB_month"
## [11] "DOB_day" "DOB_year"
## [13] "Ethnicity" "Race_ai_an"
## [15] "Race_asian" "Race_black"
## [17] "Race_pac_isl" "Race_white"
## [19] "Race_other" "Race_dk"
## [21] "Race_refuse" "Race_other_TEXT"
## [23] "Marital_status" "Num_in_household"
## [25] "Num_children" "Child_under_18"
## [27] "Education" "Education_TEXT"
## [29] "Birth_country" "Birth_country_TEXT"
## [31] "Armed_forces" "Army"
## [33] "Navy" "Marine_corp"
## [35] "Air_force" "Coast_guard"
## [37] "Other" "Armed_forces_dk"
## [39] "Armed_forces_refuse" "Armed_forces_TEXT"
## [41] "Work_for_pay" "Self_employed"
## [43] "Looking" "Laid_off"
## [45] "Retired" "Homemaker"
## [47] "Student" "Disabled"
## [49] "Maternity_sick_leave" "employ_other"
## [51] "employ_other_A" "employ_dk"
## [53] "employ_TEXT" "Occupation"
## [55] "Farmer" "Pesticides"
## [57] "Lived_in_Yuma" "Born_in_Yuma"
## [59] "Yuma_pregnant_mom" "Residence_type"
## [61] "Residence_TEXT" "Own_rent"
## [63] "Own_rent_TEXT" "Homeless"
## [65] "Water_home" "Water_home_TEXT"
## [67] "Water_cooking" "Water_cooking_TEXT"
## [69] "Water_coffee" "Water_coffee_TEXT"
## [71] "Water_work" "Water_work_TEXT"
## [73] "Health_care" "Health_care_source"
## [75] "Health_care_source_TEXT" "Healthcare_lastyear"
## [77] "Health_consult" "Health_consult_place"
## [79] "Health_consult_place_TEXT" "Routine_care"
## [81] "Routine_care_TEXT" "Doctor_time"
## [83] "ER_visits" "afford_rx_meds"
## [85] "afford_mh_care" "afford_dental_care"
## [87] "afford_eyeglasses" "afford_specialist"
## [89] "afford_follow_up" "Delay_telephone"
## [91] "Delay_appt" "Delay_wait"
## [93] "Delay_open" "Delay_transportation"
## [95] "Mental_health" "General_health"
## [97] "Health_compare" "Height_ft"
## [99] "Height_inch" "Weight"
## [101] "Overweight" "Dental_visit"
## [103] "Dental_health" "arthritis"
## [105] "gout" "chf"
## [107] "chd" "angnia"
## [109] "heart_attack" "stroke"
## [111] "emphysema" "thyroid_prob"
## [113] "hyperthyroid" "hypothyroid"
## [115] "thyroid_cancer" "goiter"
## [117] "weak_kidney" "cancer"
## [119] "pos" "undescended"
## [121] "ambiguous" "HTN"
## [123] "HTN_hx_meds" "HTN_current_med"
## [125] "Cholesterol" "Chol_med"
## [127] "Diabetes" "DM_age"
## [129] "DM_age_TEXT" "insulin"
## [131] "diabetic_pills" "Income"
## [133] "age" "thyroid_prob_D"
## [135] "Hyper_D" "Hypo_D"
## [137] "Thyroid_cancer_D" "Goiter_D"
## [139] "thyroid_conditions" "clinical"
## [141] "endo" "fT3"
## [143] "TSH" "fT4"
## [145] "TotalT4" "TotalT3"
## [147] "Cortisol" "latitude"
## [149] "longitude" "emr_height_inches"
## [151] "emr_weight" "emr_bmi"
## [153] "emr_thyroid_problem" "emr_hyperthyroidism"
## [155] "emr_hypothyroidism" "emr_thyroid_cancer"
## [157] "emr_goiter" "emr_cancer"
## [159] "emr_pos" "emr_diabetes"
## [161] "emr_obesity" "sc"
## [163] "emr_sud" "emr_asd"
## [165] "meds" "meds_name"
## [167] "thyroid_med" "free_t3_med"
## [169] "tsh_med" "total_t4_med"
## [171] "free_t4_med" "total_t3_med"
## [173] "cortisol_med" "Fe57"
## [175] "As75" "height_total"
## [177] "bmi" "age_group"
## [179] "Site_Collapsed" "hyper_recode"
## [181] "hypo_recode" "thy_cancer_recode"
## [183] "goiter_recode" "pos_recode"
## [185] "dm_recode" "emr_dm_recode"
## [187] "sud" "tobacco_history"
## [189] "number_years" "pack_per_day"
## [191] "quit_date" "tremor"
## [193] "FAC1_1" "FAC2_1"
## [195] "FAC3_1" "FAC4_1"
## [197] "FAC5_1" "Date"
## [199] "DOB" "Ln_Mn55"
## [201] "Ln_Cu65" "Ln_Cd111"
## [203] "Ln_Hg202" "Ln_Pb208"
## [205] "Ln_U238" "PC1"
## [207] "PC2" "PC3"
## [209] "PC4" "PC5"
#BMI looks to be skewed (lognormally distributed)
hist(log(Metals$bmi))
hist(log(Metals$emr_bmi))
ggplot(Metals, aes(x=log(bmi), y=PC1))+
geom_point(aes(color=as.factor(Site)))+
geom_smooth(method=lm, se=F)+
scale_color_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
xlab("\nBMI")+
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
cor.test(Metals$PC1, log(Metals$bmi), method = "pearson") #is this the correct test to use?
##
## Pearson's product-moment correlation
##
## data: Metals$PC1 and log(Metals$bmi)
## t = 0.98045, df = 196, p-value = 0.3281
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07026461 0.20728357
## sample estimates:
## cor
## 0.06986136
ggplot(Metals, aes(x=log(bmi), y=PC2))+
geom_point(aes(color=as.factor(Site)))+
geom_smooth(method=lm, se=F)+
scale_color_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
xlab("\nBMI")+
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Removed 8 rows containing missing values (geom_point).
cor.test(Metals$PC2, log(Metals$bmi), method = "pearson")
##
## Pearson's product-moment correlation
##
## data: Metals$PC2 and log(Metals$bmi)
## t = 1.643, df = 196, p-value = 0.102
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0232653 0.2519017
## sample estimates:
## cor
## 0.1165543
#Looking at binary variables for weight / obesity
ggplot(subset(Metals, Overweight==1|Overweight==2), aes(x=as.factor(Overweight), y=PC1, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nOverweight by Site")+
theme_bw()
ggplot(subset(Metals, Overweight==1|Overweight==2), aes(x=as.factor(Overweight), y=PC1))+
geom_boxplot()+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nOverweight")+
theme_bw()
ggplot(subset(Metals, Overweight==1|Overweight==2), aes(x=as.factor(Overweight), y=PC2, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nOverweight by Site")+
theme_bw()
ggplot(subset(Metals, Overweight==1|Overweight==2), aes(x=as.factor(Overweight), y=PC2))+
geom_boxplot()+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nOverweight")+
theme_bw()
ggplot(subset(Metals, emr_obesity==1|emr_obesity==0), aes(x=as.factor(emr_obesity), y=PC1, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "0"="No"))+
xlab("\nObesity (EMR)")+
theme_bw()
ggplot(subset(Metals, emr_obesity==1|emr_obesity==0), aes(x=as.factor(emr_obesity), y=PC2, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "0"="No"))+
xlab("\nObesity (EMR)")+
theme_bw()
ggplot(subset(Metals, arthritis==1|arthritis==2), aes(x=as.factor(arthritis), y=PC1, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nArthritis by Site")+
theme_bw()
ggplot(subset(Metals, arthritis==1|arthritis==2), aes(x=as.factor(arthritis), y=PC1))+
geom_boxplot()+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nArthritis")+
theme_bw()
ggplot(subset(Metals, arthritis==1|arthritis==2), aes(x=as.factor(arthritis), y=PC2, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nArthritis")+
theme_bw()
ggplot(subset(Metals, cancer==1|cancer==2), aes(x=as.factor(cancer), y=PC1, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nCancers")+
theme_bw()
ggplot(subset(Metals, cancer==1|cancer==2), aes(x=as.factor(cancer), y=PC2, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nCancer")+
theme_bw()
ggplot(subset(Metals, dm_recode==1|dm_recode==0), aes(x=as.factor(dm_recode), y=PC1, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "0"="No"))+
xlab("\nDiabetes Mellitus")+
theme_bw()
ggplot(subset(Metals, dm_recode==1|dm_recode==0), aes(x=as.factor(dm_recode), y=PC2, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "0"="No"))+
xlab("\nDiabetes Mellitus")+
theme_bw()
ggplot(subset(Metals, Cholesterol==1|Cholesterol==2), aes(x=as.factor(Cholesterol), y=PC1, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nCholesterol")+
theme_bw()
ggplot(subset(Metals, Cholesterol==1|Cholesterol==2), aes(x=as.factor(Cholesterol), y=PC2, fill=as.factor(Site)))+
geom_boxplot()+
scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
xlab("\nCholesterol")+
theme_bw()
ggplot(subset(Metals, Dental_health==1|Dental_health==2|Dental_health==3|Dental_health==4|Dental_health==5), aes(x=as.factor(Dental_health), y=PC1))+
geom_boxplot()+
scale_x_discrete(labels=c("1"="Excellent", "2"="Very Good", "3"="Good", "4"="Fair", "5"="Poor"))+
xlab("\nDental Health")+
theme_bw()
ggplot(subset(Metals, Dental_health==1|Dental_health==2|Dental_health==3|Dental_health==4|Dental_health==5), aes(x=as.factor(Dental_health), y=PC2))+
geom_boxplot()+
scale_x_discrete(labels=c("1"="Excellent", "2"="Very Good", "3"="Good", "4"="Fair", "5"="Poor"))+
xlab("\nDental Health")+
theme_bw()
\(Note:\) The following scripts must be run prior to completing this analysis: Data_Clean.Rmd, Yuma_Metals_Analysis.Rmd, Demographics.Rmd, yuma-metals-20210821.Rmd, and Exploration.Rmd. This will provide the accurate dataframes used in the following code.
#create_report(Metals) (Remove the # when you actually want to run - takes too much processing power otherwise.)
#There is a single data point for the year 2013, with the rest being April-November 2018.
#This is likely either a mistake in input or some other error, but I will not include this point in subsequent analysis.
#The concentration values for this single row are blank anyway.
ggplot(Metals, aes(y=PC1, x=Date, color=as.factor(Site)))+
geom_point()+
geom_smooth(se=F)+
scale_color_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
theme(panel.background = element_rect(fill="#ffffff"),
axis.text.x=element_text(angle=60, hjust=1),
axis.line = element_line(color="#000000"))+
scale_x_date(date_breaks = "1 month", date_labels = "%b")+
ylab("PC1")+
xlab("Date of Sample Collection")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
ggplot(Metals, aes(y=PC2, x=Date, color=as.factor(Site)))+
geom_point()+
geom_smooth(se=F)+
scale_color_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
theme(panel.background = element_rect(fill="#ffffff"),
axis.text.x=element_text(angle=60, hjust=1),
axis.line = element_line(color="#000000"))+
scale_x_date(date_breaks = "1 month", date_labels = "%b")+
ylab("PC2")+
xlab("Date of Sample Collection")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Removed 4 rows containing missing values (geom_point).
#Relevant paper: https://www.sciencedirect.com/science/article/pii/S1878535212000263
Metals %>%
group_by(Site) %>%
summarize(number_NAs_PC1=sum(is.na(PC1)),
number_NAs_PC2=sum(is.na(PC2)))
## # A tibble: 3 × 3
## Site number_NAs_PC1 number_NAs_PC2
## <dbl> <int> <int>
## 1 1 0 0
## 2 2 0 0
## 3 3 0 0
#Making a table re: PC values
knitr::kable(Metals %>%
group_by(Site) %>%
summarize(n_PC1=sum(!is.na(PC1)),
geomean_PC1=round(exp(mean(PC1, na.rm = T)), digits = 2),
geosd_PC1=round(exp(sd(PC1, na.rm = T)), digits = 2),
n_PC2=sum(!is.na(PC2)),
geomean_PC2=round(exp(mean(PC2, na.rm = T)), digits = 2),
geosd_PC2=round(exp(sd(PC2, na.rm = T)), digits = 2)))
| Site | n_PC1 | geomean_PC1 | geosd_PC1 | n_PC2 | geomean_PC2 | geosd_PC2 |
|---|---|---|---|---|---|---|
| 1 | 27 | 1.11 | 2.32 | 27 | 1.31 | 1.41 |
| 2 | 101 | 1.56 | 2.55 | 101 | 1.08 | 1.54 |
| 3 | 78 | 0.79 | 1.90 | 78 | 0.76 | 1.74 |
#create_report(data.melt) (Only run if you need to.)
hist(Metals$PC1)
hist(Metals$PC2)
#Cortisol against PCA results
PC1_Cortisol <- ggplot(Metals, aes(x=PC1, y=Cortisol))+geom_point()
PC2_Cortisol <- ggplot(Metals, aes(x=PC2, y=Cortisol))+geom_point()
grid.arrange(PC1_Cortisol, PC2_Cortisol)
## Warning: Removed 6 rows containing missing values (geom_point).
## Removed 6 rows containing missing values (geom_point).